# Read the Milan Airbnb listings (comma-separated file) and echo the raw table.
df <- read.csv("Airbnb_Milan.csv", sep = ",")
df
# Reduce the table to the host, size, price and review columns, then echo it.
df_red <- df[
  ,
  c(
    "host_is_superhost",
    "host_identity_verified",
    "bathrooms",
    "bedrooms",
    "daily_price",
    "security_deposit",
    "minimum_nights",
    "number_of_reviews",
    "review_scores_rating"
  ),
  drop = FALSE
]
df_red
# Here recode() is called inside mutate() rather than directly on the column; the result should be the same.
# install.packages("tidyr")
# install.packages("dplyr")
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Relabel the 0/1 superhost flag and store it as a factor.
# recode() is namespace-qualified so the call keeps resolving to dplyr even if
# another attached package exports its own recode() (the original note warned
# about the 'car' package for this reason).
# NOTE(review): 0 is labelled "SI" and 1 "NO" -- confirm this matches how the
# source data encodes superhosts.
df_red <- df_red %>%
  mutate(
    host_is_superhost = factor(
      dplyr::recode(host_is_superhost, "0" = "SI", "1" = "NO")
    )
  )
df_red
# Same treatment for the identity-verification flag.
# NOTE(review): 0 is labelled "VERIFICA" and 1 "NO VERIFICA" -- confirm against
# the source data encoding.
df_red <- df_red %>%
  mutate(
    host_identity_verified = factor(
      dplyr::recode(host_identity_verified, "0" = "VERIFICA", "1" = "NO VERIFICA")
    )
  )
df_red
# Descriptive statistics for every column of the reduced table.
summary(df_red)
## host_is_superhost host_identity_verified bathrooms bedrooms
## NO:2693 NO VERIFICA:4020 Min. : 1.000 Min. : 0.000
## SI:6629 VERIFICA :5302 1st Qu.: 3.000 1st Qu.: 1.000
## Median : 3.000 Median : 1.000
## Mean : 3.357 Mean : 1.218
## 3rd Qu.: 3.000 3rd Qu.: 2.000
## Max. :17.000 Max. :10.000
## daily_price security_deposit minimum_nights number_of_reviews
## Min. : 10.0 Min. : 1.00 Min. : 1.000 Min. : 1.00
## 1st Qu.: 59.0 1st Qu.: 1.00 1st Qu.: 1.000 1st Qu.: 4.00
## Median : 75.0 Median : 1.00 Median : 2.000 Median : 14.00
## Mean : 103.7 Mean : 21.71 Mean : 3.205 Mean : 39.64
## 3rd Qu.: 107.0 3rd Qu.: 48.00 3rd Qu.: 2.000 3rd Qu.: 44.00
## Max. :3000.0 Max. :143.00 Max. :365.000 Max. :791.00
## review_scores_rating
## Min. : 20.00
## 1st Qu.: 90.00
## Median : 95.00
## Mean : 93.15
## 3rd Qu.: 99.00
## Max. :100.00
# Show the listings with a minimum stay of at most 7 nights. The result is only
# displayed, not stored: later steps keep working on the unfiltered df_red.
df_red[df_red[["minimum_nights"]] <= 7, ]
# Mean daily price for each identity-verification label.
df_red %>%
  group_by(host_identity_verified) %>%
  summarise(mean = mean(daily_price))
El precio medio por día es de 103.7127 si el anfitrión tiene el perfil verificado y de 103.7647 si no lo tiene.
# Mean and total number of reviews for each superhost label.
df_red %>%
  group_by(host_is_superhost) %>%
  summarise(
    Media_resenas = mean(number_of_reviews),
    N_resenas_tot = sum(number_of_reviews)
  )
En media, un anfitrión que no es superhost tiene más reseñas.
# Mean review score for each superhost label.
# The output column is named for what it holds (a mean rating); the previous
# name, N_resenas, suggested a count of reviews.
df_red %>%
  group_by(host_is_superhost) %>%
  summarise(Valoracion_media = mean(review_scores_rating))
Curiosamente, un anfitrión que no es superhost tiene, en media, una valoración más alta.
# Bucket every listing by its review score into the character column CATEGORÍA:
#   score < 50         -> "NO ACONSEJABLE"
#   50 <= score < 76   -> "ESTÁNDAR"
#   76 <= score < 101  -> "TOP"
# A single cut() call replaces three masked assignments. The bins are
# left-closed and non-overlapping, so fractional scores such as 49.5 fall in
# exactly one bucket, and missing or out-of-range scores yield NA instead of
# raising the "missing values are not allowed in subscripted assignments" error.
# as.character() keeps the column a plain character vector, so table() and
# facets keep their alphabetical ordering.
df_red[["CATEGORÍA"]] <- as.character(
  cut(
    df_red[["review_scores_rating"]],
    breaks = c(-Inf, 50, 76, 101),
    labels = c("NO ACONSEJABLE", "ESTÁNDAR", "TOP"),
    right = FALSE
  )
)
df_red
# Number of listings per category.
table(df_red["CATEGORÍA"])
##
## ESTÁNDAR NO ACONSEJABLE TOP
## 263 42 9017
# Base-graphics histogram of the daily price.
hist(
  df_red$daily_price,
  main = "Histograma del precio por día",
  xlab = "Precio por día"
)
# install.packages("ggplot2")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
# Scatter plot of bedrooms against bathrooms with a fitted linear-model line.
# The x/y mapping is declared once and inherited by both layers.
ggplot(df_red, aes(x = bathrooms, y = bedrooms)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Relación dormitorios y baños")
## `geom_smooth()` using formula = 'y ~ x'
Se puede observar que el número de dormitorios y el de baños guardan, en cierta medida, una relación lineal positiva. (Quizá haya otro modelo que describa mejor esta relación.)
# Histogram (10 bins) of the number of reviews, filled by and split into one
# panel per identity-verification label.
ggplot(
  df_red,
  aes(x = number_of_reviews, fill = host_identity_verified)
) +
  geom_histogram(bins = 10) +
  facet_wrap(~ host_identity_verified)
# Histogram (10 bins) of the security deposit, with side-by-side bars per
# superhost label and one panel per review category.
ggplot(
  df_red,
  aes(x = security_deposit, fill = host_is_superhost)
) +
  geom_histogram(color = "black", position = "dodge", bins = 10) +
  facet_wrap(~ CATEGORÍA)
# Split the listings into one data frame per review category.
# %in% is used instead of == so that rows whose category is NA are dropped
# rather than returned as all-NA rows.
df_top <- df_red[df_red[["CATEGORÍA"]] %in% "TOP", ]
df_est <- df_red[df_red[["CATEGORÍA"]] %in% "ESTÁNDAR", ]
df_no_ac <- df_red[df_red[["CATEGORÍA"]] %in% "NO ACONSEJABLE", ]
# Security-deposit histogram for each category, with side-by-side bars per
# superhost label. The bin count is left at the ggplot2 default, hence the
# stat_bin() message echoed after each plot.
ggplot(df_top, aes(x = security_deposit, fill = host_is_superhost)) +
  geom_histogram(color = "red", position = "dodge") +
  ggtitle("Categoría = TOP")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df_est, aes(x = security_deposit, fill = host_is_superhost)) +
  geom_histogram(color = "red", position = "dodge") +
  ggtitle("Categoría = Estándar")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(df_no_ac, aes(x = security_deposit, fill = host_is_superhost)) +
  geom_histogram(color = "red", position = "dodge") +
  ggtitle("Categoría = No aconsejable")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.